library(mosaic)
library(tidyverse)
library(lubridate)
library(DataComputing)
library(rvest)
library(broom)
As COVID-19 spreads at an alarming rate, a pressing question at a global scale emerges– what factors of a country contribute to the spread of Coronavirus. We hope to analyze the relationship between a country’s population level, population density, and continent categorization on the spread of COVID-19.
COVID <- read.csv(file = "total-covid-cases-deaths-per-million.csv")
COVID
COVID %>%
nrow()
[1] 9487
COVID %>%
names()
[1] "total.covid.cases.deaths.per.million" "X"
[3] "X.1" "X.2"
[5] "X.3" "X.4"
[7] "X.5" "X.6"
[9] "X.7" "X.8"
[11] "X.9" "X.10"
[13] "X.11" "X.12"
[15] "X.13" "X.14"
[17] "X.15" "X.16"
[19] "X.17" "X.18"
[21] "X.19" "X.20"
[23] "X.21" "X.22"
[25] "X.23" "X.24"
[27] "X.25" "X.26"
[29] "X.27" "X.28"
[31] "X.29" "X.30"
[33] "X.31" "X.32"
[35] "X.33" "X.34"
[37] "X.35" "X.36"
[39] "X.37" "X.38"
[41] "X.39" "X.40"
[43] "X.41" "X.42"
[45] "X.43" "X.44"
[47] "X.45" "X.46"
[49] "X.47" "X.48"
[51] "X.49" "X.50"
[53] "X.51" "X.52"
[55] "X.53" "X.54"
[57] "X.55" "X.56"
[59] "X.57" "X.58"
[61] "X.59" "X.60"
[63] "X.61" "X.62"
[65] "X.63" "X.64"
[67] "X.65" "X.66"
[69] "X.67" "X.68"
[71] "X.69" "X.70"
[73] "X.71" "X.72"
[75] "X.73" "X.74"
[77] "X.75" "X.76"
[79] "X.77" "X.78"
[81] "X.79" "X.80"
[83] "X.81" "X.82"
[85] "X.83" "X.84"
[87] "X.85" "X.86"
[89] "X.87" "X.88"
[91] "X.89" "X.90"
[93] "X.91" "X.92"
[95] "X.93" "X.94"
[97] "X.95" "X.96"
[99] "X.97" "X.98"
[101] "X.99" "X.100"
[103] "X.101" "X.102"
[105] "X.103" "X.104"
[107] "X.105" "X.106"
[109] "X.107" "X.108"
[111] "X.109" "X.110"
[113] "X.111" "X.112"
[115] "X.113" "X.114"
[117] "X.115" "X.116"
[119] "X.117" "X.118"
[121] "X.119" "X.120"
[123] "X.121" "X.122"
[125] "X.123" "X.124"
[127] "X.125" "X.126"
[129] "X.127" "X.128"
[131] "X.129" "X.130"
[133] "X.131" "X.132"
[135] "X.133" "X.134"
[137] "X.135" "X.136"
[139] "X.137" "X.138"
[141] "X.139" "X.140"
[143] "X.141" "X.142"
[145] "X.143" "X.144"
[147] "X.145" "X.146"
[149] "X.147" "X.148"
[151] "X.149" "X.150"
[153] "X.151" "X.152"
[155] "X.153" "X.154"
[157] "X.155" "X.156"
[159] "X.157" "X.158"
[161] "X.159" "X.160"
[163] "X.161" "X.162"
[165] "X.163" "X.164"
[167] "X.165" "X.166"
[169] "X.167" "X.168"
[171] "X.169" "X.170"
[173] "X.171" "X.172"
[175] "X.173" "X.174"
[177] "X.175" "X.176"
[179] "X.177" "X.178"
[181] "X.179" "X.180"
[183] "X.181" "X.182"
[185] "X.183" "X.184"
[187] "X.185" "X.186"
[189] "X.187" "X.188"
[191] "X.189" "X.190"
[193] "X.191" "X.192"
[195] "X.193" "X.194"
[197] "X.195" "X.196"
[199] "X.197" "X.198"
[201] "X.199" "X.200"
[203] "X.201" "X.202"
[205] "X.203" "X.204"
[207] "X.205" "X.206"
[209] "X.207" "X.208"
[211] "X.209" "X.210"
[213] "X.211" "X.212"
[215] "X.213" "X.214"
[217] "X.215" "X.216"
[219] "X.217" "X.218"
[221] "X.219" "X.220"
[223] "X.221" "X.222"
[225] "X.223" "X.224"
[227] "X.225" "X.226"
[229] "X.227" "X.228"
[231] "X.229" "X.230"
[233] "X.231" "X.232"
[235] "X.233" "X.234"
[237] "X.235" "X.236"
[239] "X.237" "X.238"
[241] "X.239" "X.240"
[243] "X.241" "X.242"
[245] "X.243" "X.244"
[247] "X.245" "X.246"
[249] "X.247" "X.248"
[251] "X.249" "X.250"
[253] "X.251" "X.252"
[255] "X.253" "X.254"
COVID %>%
head()
CountryData
CountryData %>%
nrow()
[1] 256
CountryData %>%
names()
[1] "country" "area" "pop" "growth"
[5] "birth" "death" "migr" "maternal"
[9] "infant" "life" "fert" "health"
[13] "HIVrate" "HIVpeople" "HIVdeath" "obesity"
[17] "underweight" "educ" "unemploymentYouth" "GDP"
[21] "GDPgrowth" "GDPcapita" "saving" "indProd"
[25] "labor" "unemployment" "family" "tax"
[29] "budget" "debt" "inflation" "discount"
[33] "lending" "narrow" "broad" "credit"
[37] "shares" "balance" "exports" "imports"
[41] "gold" "externalDebt" "homeStock" "abroadStock"
[45] "elecProd" "elecCons" "elecExp" "elecImp"
[49] "elecCap" "elecFossil" "elecNuc" "elecHydro"
[53] "elecRenew" "oilProd" "oilExp" "oilImp"
[57] "oilRes" "petroProd" "petroCons" "petroExp"
[61] "petroImp" "gasProd" "gasCons" "gasExp"
[65] "gasImp" "gasRes" "mainlines" "cell"
[69] "netHosts" "netUsers" "airports" "railways"
[73] "roadways" "waterways" "marine" "military"
CountryData %>%
head()
countryRegions
Error: object 'countryRegions' not found
rr countryRegions %>% nrow()
[1] 254
rr countryRegions %>% names()
[1] \ISO3\ \ADMIN\ \REGION\ \continent\ \GEO3major\ \GEO3\ \IMAGE24\ \GLOCAF\
[9] \Stern\ \SRESmajor\ \SRES\ \GBD\ \AVOIDnumeric\ \AVOIDname\ \LDC\ \SID\
[17] \LLDC\
rr countryRegions %>% head()
rr COVID
Since our analysis is focused on the spread of COVID-19, we select only columns which pertain to the number of COVID-19 cases in countries over time.
rr TidyCOVID <- COVID %>% rename(country = total.covid.cases.deaths.per.million ) %>% rename( Code = X ) %>% rename(date = X.1 ) %>% rename(casesPerMillion = X.3) %>% filter(row_number() > 1) %>% subset(select = c(1,2,3,5)) %>% mutate( country = as.character(country) ) %>% mutate(date = mdy(date)) %>% mutate(casesPerMillion = as.integer(casesPerMillion) - 1)
rr TidyCOVID
EVELYN pls explain what an instance represents
We will extract the ISO3 country code and continent from the countryRegions data. Since naming conventions of countries is variate, the ISO3 country code allows us a standardized demarcation of country with which to join with other data tables.
rr Labels <- countryRegions %>% subset(select = c(3, )) %>% rename(continent = REGION) Labels
We will select the aspects of CountryData relevant to our analysis. These attributes are: area (sq km) and pop (number of people).
rr RelevantCountryData <- CountryData %>% subset(select = c(1,2,3)) %>% mutate(popdensity = pop/area) RelevantCountryData
Calculate the number of cases in each country by multiplying casesPerMillion by the country’s population (in millions).
rr COVIDGrowth <- inner_join(TidyCOVID, RelevantCountryData, by = c()) %>% mutate( = (casesPerMillion * round(pop/1000000, digits = 0))) COVIDGrowth <- COVIDGrowth %>% left_join(Labels, by = c( = 3))
Column `Code`/`ISO3` joining factor and character vector, coercing into character vector
rr COVIDGrowth
This table records the first date that a country recorded a nonzero number of COVID-19 cases. This datagraph will help us visualize when countries first became infected.
rr FirstInstance <- COVIDGrowth %>% filter(cases != 0) %>% group_by(country, continent) %>% summarise(beginningofspread = min(date))
FirstInstance
This table averages the number of case increase per day from the first day a country had COVID-19 to the most recent in the data table (April 5 2020)
rr DailySpread <- left_join(COVIDGrowth, FirstInstance, by = c()) %>% filter(date == \2020-04-05) %>% mutate(dayselapsed = date - beginningofspread) %>% mutate(dailyspread = cases / as.numeric(dayselapsed) ) %>% mutate(dailyspreadpermillion = casesPerMillion / as.numeric(dayselapsed) ) %>% subset(select = c(, , , )) DailySpread\(dailyspread[is.na(DailySpread\)dailyspread)] <- 0 DailySpread\(dailyspreadpermillion[is.na(DailySpread\)dailyspreadpermillion)] <- 0 DailySpread
rr COVIDFinal <- left_join(COVIDGrowth, DailySpread, by = c())
rr COVIDFinal
rr COVIDFinal %>% group_by(date) %>% summarise(totalcases = sum(cases)) %>% ggplot(aes(x = date, y = totalcases)) + geom_point() + xlab() + ylab(-19 Cases)
This graph shows the exponential growth trend that we already knew existed with COVID-19. This confirms that our data is valid in that it accurately represents the trends that have been described by researchers and scientists in the media.
rr na.omit(COVIDFinal) %>% group_by(date, continent) %>% summarise(totalcases = sum(cases)) %>% ggplot(aes(x = date, y = totalcases)) + geom_point() + facet_wrap(~continent) + xlab() + ylab(-19 Cases)
This graph shows the growth in cases by continent. The trend is much stronger in the origin continent of Asia, but also shows strength growing in Europe, Africa, North America, and South America.
rr na.omit(FirstInstance) %>% ggplot(aes(x = beginningofspread, fill = continent)) + geom_dotplot(stackgroups = TRUE, binwidth = 1, binpositions=) + xlab(’s First Case of COVID-19) + theme(panel.background = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.title.y = element_blank())
This graph shows the progression of the COVID-19 spread across continents. Asia was obviously the first country to have cases, but North America and Europe quickyl followed. South America and Africa both joined in late February, whereas Australia was able to isolate themselves until mid to late March.
rr
COVIDFinal %>% group_by(country) %>% summarise(dailyspread = mean(dailyspread)) %>% arrange(desc(dailyspread)) %>% head(20) %>% ggplot(aes(x = reorder(country, desc(dailyspread)), y= dailyspread)) + geom_bar(stat=, position = ‘stack’, width=.9) + theme(axis.text.x=element_text(angle = 60, hjust = 1)) + scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) + ylab(Number Infected Per Day) + theme(axis.title.x = element_blank())
According to this graph, the countries with the top four infection rates are China, India, Indonesia, and the United States, closely followed by Brazil. Keep in mind that this is adjusted for the population, so in this graph, population is not a confounding factor. ### Compare this to which countries have the highest populations
rr COVIDFinal %>% group_by(country) %>% summarise(pop = mean(pop)) %>% arrange(desc(pop)) %>% head(20) %>% ggplot(aes(x = reorder(country, desc(pop)), y= pop)) + geom_bar(stat=, position = ‘stack’, width=.9) + theme(axis.text.x=element_text(angle = 60, hjust = 1)) + scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) + ylab() + theme(axis.title.x = element_blank())
We see a similar trend here, in that the top four (in slightly different order) consist of China, India, Indonesia, the United States, and closely trailing Brazil.
rr na.omit(COVIDFinal) %>% ggplot(aes(x = pop, y = dailyspread, color = continent)) + geom_point() + xlab(of Country) + ylab(Number Infected Per Day)
Here, we can see again the trend that was represented in the two previous graphs, but now they are all in the same data frame. While most of the rest of the world trails with under 10,000 infected per day, the 5 countries with the highest popualtion in the data set have over 15,000 - up to over 50,000- and are far separated from the rest of the pack. Population, while not a direct factor contributing to the level of development of a country, is a decent indicator of the rate of infection.
rr na.omit(COVIDFinal) %>% ggplot(aes(x = pop, y = dailyspread, color = continent)) + geom_point() + xlim(0,500000000) + ylim(0, 40000) + xlab(of Country) + ylab(Number Infected Per Day) + stat_smooth(method = lm)
rr
COVIDFinal %>% group_by(country) %>% summarise(dailyspreadpermillion = mean(dailyspreadpermillion)) %>% arrange(desc(dailyspreadpermillion)) %>% head(20) %>% ggplot(aes(x = reorder(country, desc(dailyspreadpermillion)), y= dailyspreadpermillion)) + geom_bar(stat=, position = ‘stack’, width=.9) + theme(axis.text.x=element_text(angle = 60, hjust = 1)) + scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) + ylab(Per Million Infected Per Day) + theme(axis.title.x = element_blank())
rr
COVIDFinal %>% group_by(country) %>% summarise(popdensity = mean(popdensity)) %>% arrange(desc(popdensity)) %>% head(20) %>% ggplot(aes(x = reorder(country, desc(popdensity)), y= popdensity)) + geom_bar(stat=, position = ‘stack’, width=.9) + theme(axis.text.x=element_text(angle = 60, hjust = 1)) + scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) + ylab(Density (people/sq km)) + theme(axis.title.x = element_blank())
rr na.omit(COVIDFinal) %>% ggplot(aes(x = popdensity, y = dailyspreadpermillion)) + geom_point()
rr na.omit(COVIDFinal) %>% ggplot(aes(x = popdensity, y = dailyspreadpermillion)) + geom_point() + facet_wrap(~continent) + xlim(0,1500)
evelyn pls write a conclusion here… something about there being a correlation btwn population and spread, but once standardized, the correlation is far less evident… we can not prove a correlation between population density and infection rate/million.
also i defined this function (because we need a user defined function and to use wide/narrow form), but unsure exactly where to put it,, lmk if u think of a good place.
rr WideCountries <- COVIDFinal %>% subset(select = c(,
)) %>% spread(key = date, value = cases) WideCountries[is.na(WideCountries)] <- 0 WideCountries
rr compareCOVID <- function(countryA, countryB) {
A <-
WideCountries %>%
filter(country == countryA)
B <- WideCountries %>% filter(country == countryB) A <- A %>% gather(key = date, value = count) %>% filter(row_number() > 1) %>% mutate(date = lubridate::ymd(date)) %>% mutate(count = as.numeric(count)) %>% mutate(country = countryA)
B <- B %>% gather(key = date, value = count) %>% filter(row_number() > 1) %>% mutate(date = lubridate::ymd(date))%>% mutate(count = as.numeric(count)) %>% mutate(country = countryB)
GG <- rbind(A,B)
return( ggplot(GG, aes(x = date, y = count, color = country)) + stat_smooth(formula = y ~ x, method = ) + ylab(of COVID-19 Cases) + xlab())
}
rr compareCOVID(, States)
rr compareCOVID(, )
rr compareCOVID(Rico, )